"""espn 网页数据爬取"""

from requests_html import HTMLSession
from pprint import pprint
from collections import OrderedDict
import re


def get_team(imgurl):
    """Return the logo file name (without ``.png``) found in an image URL.

    The captured name is a run of non-dot characters that directly follows
    a ``/`` and is directly followed by ``.png``; the greedy ``.*`` prefix
    makes the right-most such ``/`` win. The result is used as the team
    identifier of a play.

    Raises:
        IndexError: If ``imgurl`` contains no such ``/<name>.png`` part.
    """
    logo_name_pattern = r'.*(?<=/)([^\.]+)(?=\.png)'
    names = re.findall(logo_name_pattern, imgurl)
    # Only the first hit matters; an empty list raises IndexError here.
    return names[0]


def parse_elem(obj, _type):
    """Pull one piece of data out of a parsed element.

    Args:
        obj: Object exposing a ``text`` attribute and an ``attrs`` mapping.
        _type: ``'text'`` to get ``obj.text``, ``'src'`` to get
            ``obj.attrs['src']``.

    Returns:
        The requested value, or ``None`` for any other ``_type``.
    """
    if _type == 'text':
        return obj.text
    if _type == 'src':
        return obj.attrs['src']
    # Unknown kinds fall through and yield None.
    return None


def play_by_play(_id, quarters=(1, 2, 3, 4)):
    """Fetch an ESPN NBA play-by-play page and print one line per play.

    Every printed line has the form ``time | team | detail | score``. The
    team is derived from the logo image URL of the play's table row.

    Args:
        _id: ESPN game id; it is converted with ``str`` and appended to the
            play-by-play URL.
        quarters: Period numbers whose ``#gp-quarter-<n>`` tables are read,
            in order. Defaults to the four regulation quarters.
            NOTE(review): overtime periods presumably continue the numbering
            (5, 6, ...) -- confirm against a live page before relying on it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If a row's logo URL does not contain a ``.png`` name
            (propagated from ``get_team``).
    """
    url = 'http://www.espn.com/nba/playbyplay?gameId=' + str(_id)

    # (selector relative to a table row, kind of value to extract)
    cell_specs = [
        ('td.time-stamp', 'text'),
        ('td.logo > img', 'src'),
        ('td.game-details', 'text'),
        ('td.combined-score', 'text'),
    ]

    # Close the session when done so its connections are released.
    with HTMLSession() as session:
        r = session.get(url)
        # Fail loudly on an error page instead of silently printing nothing.
        r.raise_for_status()

        for quarter in quarters:
            rows = r.html.find('#gp-quarter-%d > table > tr' % quarter)
            for row in rows:
                # Read all cells from the same row so that a missing cell
                # cannot shift the columns of later rows out of alignment.
                values = []
                for selector, kind in cell_specs:
                    cell = row.find(selector, first=True)
                    value = parse_elem(cell, kind) if cell is not None else ''
                    values.append(value)

                if not any(values):
                    # Row carries none of the play cells; nothing to print.
                    continue

                time_stamp, logo_url, detail, score = values
                team = get_team(logo_url) if logo_url else ''
                print(' | '.join([time_stamp, team, detail, score]))


def main():
    """Print the play-by-play of one fixed sample game (id 401070218)."""
    play_by_play(401070218)


# Run the sample scrape only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
